add_phylopic(x=0.055, y=27, height=3, img=img_caurata, alpha=0.3)
tt4

# Add the species names next to the silhouettes plus dashed leader lines.
# The same annotation set is drawn twice on top of `tt4`, first with small
# labels (size 3) and then with larger ones (size 5); only the text size
# differs between the two versions.
annotate_silhouettes <- function(base_plot, text_size) {
  # Plotmath strings (parsed, hence italic()) and the y position of each name.
  species_y <- c(30, 35, 25)
  species_lab <- c("italic(Oppiella~~nova)",
                   "italic(Caenorhabditis~~elegans)",
                   "italic(Cetonia~~aurata)")
  text_layers <- Map(
    function(y_pos, lab) {
      annotate("text", x = 0.055, y = y_pos, size = text_size,
               label = lab, parse = TRUE)
    },
    species_y, species_lab
  )

  # One dashed segment per species, in the same order as the names above.
  leaders <- list(
    c(x = 0.0405, xend = 0.0496, y = 32,   yend = 30),
    c(x = 0.0405, xend = 0.0465, y = 32.9, yend = 35),
    c(x = 0.0415, xend = 0.0496, y = 30.6, yend = 25)
  )
  segment_layers <- lapply(leaders, function(s) {
    annotate("segment", x = s[["x"]], xend = s[["xend"]],
             y = s[["y"]], yend = s[["yend"]],
             color = "black", linewidth = 0.5, linetype = "dashed")
  })

  base_plot + unname(text_layers) + segment_layers
}

tt5 <- annotate_silhouettes(tt4, text_size = 3)
tt5
tt5 <- annotate_silhouettes(tt4, text_size = 5)
tt5
############  Building the legends  #############
# Each legend is produced by a throw-away plot, extracted with get_legend()
# and then placed on top of the main figure (`tt5`) with cowplot.

# Legend appearance shared by the three throw-away plots in this section.
legend_box_theme <- theme(
  legend.title = element_text(size = 11),
  legend.text = element_text(size = 9),
  legend.background = element_rect(fill = "white", color = "grey90")
)

# Wrap a legend grob in an otherwise empty plot so draw_plot() can position it.
legend_as_plot <- function(legend_grob) {
  ggplot() + theme_void() +
    annotation_custom(legend_grob, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf)
}

# Countries: random dummy points, only there so that every flag shows up in
# the legend.
country_names <- c("ar" = "Argentina", "au" = "Australia", "in" = "India", "es" = "Spain",
                   "cn" = "China", "kr" = "South Korea", "de" = "Germany", "jp" = "Japan",
                   "cz" = "Czech Republic", "us" = "USA")
d <- data.frame(
  x = rnorm(50),
  y = rnorm(50),
  country = sample(names(country_names), 50, TRUE),
  stringsAsFactors = FALSE
)
p_plot <- ggplot(d, aes(x = x, y = y, country = country)) +
  geom_flag() +
  scale_country(labels = country_names) +
  guides(country = guide_legend(title = "Country")) +
  legend_box_theme
p_plot

# Bootstrap
# NOTE(review): the colour is taken from `tt1_data$bootstrap` by position, so
# it presumably has to be row-aligned with the tree data of `tt0` -- confirm.
tt1_leyenda <- tt0 +
  new_scale_color() +
  geom_tree() +
  geom_nodepoint(aes(color = factor(tt1_data$bootstrap)), size = 2.5) +
  scale_color_manual(values = "#00CD66", labels = ">= 70", na.translate = FALSE) +
  labs(color = "Bootstrap") +
  legend_box_theme +
  new_scale_fill()
tt1_leyenda

# Sample
sample_origin_labels <- c("Invertebrate", "Human", "Animal meat", "Plant",
                          "Environment", "Aquatic\nEnvironment",
                          "Contaminated\nEnvironment")
tt3_leyenda <- gheatmap(tt0, metadata_sample_origin, width = 0.2, offset = 0.02,
                        colnames = TRUE, font.size = 4, custom_column_label = "",
                        hjust = 0.5, color = "lightgrey") +
  scale_fill_manual(values = custom_colors, name = "Sample origin", na.translate = FALSE,
                    limits = legend_order, labels = sample_origin_labels) +
  legend_box_theme +
  guides(fill = guide_legend(label.position = "right",
                             label.hjust = 0,
                             label.vjust = 0.5,
                             keyheight = unit(1.75, "lines"))) +
  new_scale_fill()
tt3_leyenda

# Combine the legends with the figure
legend_countries <- get_legend(p_plot)
legend_bootstrap <- get_legend(tt1_leyenda)
legend_sample <- get_legend(tt3_leyenda)
legend_countries_plot <- legend_as_plot(legend_countries)
legend_bootstrap_plot <- legend_as_plot(legend_bootstrap)
legend_sample_plot <- legend_as_plot(legend_sample)

# x/y/width/height are fractions of the canvas (hand-tuned positions).
combined_plot <- ggdraw() +
  draw_plot(tt5) +
  draw_plot(legend_countries_plot, x = 0, y = 0.7, width = 0.22, height = 0.28) +
  draw_plot(legend_bootstrap_plot, x = -0.002, y = 0.515, width = 0.1525, height = 0.3) +
  draw_plot(legend_sample_plot, x = 0.172, y = 0.7, width = 0.22, height = 0.2755)
print(combined_plot)
# Same three legends as before, rebuilt with larger fonts (title 14, text 12)
# and overlaid again on `tt5`.

# Legend appearance shared by the three throw-away plots below.
legend_box_theme <- theme(
  legend.title = element_text(size = 14),
  legend.text = element_text(size = 12),
  legend.background = element_rect(fill = "white", color = "grey90")
)

# Wrap a legend grob in an otherwise empty plot so draw_plot() can position it.
legend_as_plot <- function(legend_grob) {
  ggplot() + theme_void() +
    annotation_custom(legend_grob, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf)
}

# Bootstrap
# NOTE(review): the colour is taken from `tt1_data$bootstrap` by position, so
# it presumably has to be row-aligned with the tree data of `tt0` -- confirm.
tt1_leyenda <- tt0 +
  new_scale_color() +
  geom_tree() +
  geom_nodepoint(aes(color = factor(tt1_data$bootstrap)), size = 2.5) +
  scale_color_manual(values = "#00CD66", labels = ">= 70", na.translate = FALSE) +
  labs(color = "Bootstrap") +
  legend_box_theme +
  new_scale_fill()
tt1_leyenda

# Countries (re-uses the dummy data frame `d` created earlier in the script)
country_names <- c("ar" = "Argentina", "au" = "Australia", "in" = "India", "es" = "Spain",
                   "cn" = "China", "kr" = "South Korea", "de" = "Germany", "jp" = "Japan",
                   "cz" = "Czech Republic", "us" = "USA")
p_plot <- ggplot(d, aes(x = x, y = y, country = country)) +
  geom_flag() +
  scale_country(labels = country_names) +
  guides(country = guide_legend(title = "Country")) +
  legend_box_theme
p_plot

# Sample
sample_origin_labels <- c("Invertebrate", "Human", "Animal meat", "Plant",
                          "Environment", "Aquatic\nEnvironment",
                          "Contaminated\nEnvironment")
tt3_leyenda <- gheatmap(tt0, metadata_sample_origin, width = 0.2, offset = 0.02,
                        colnames = TRUE, font.size = 4, custom_column_label = "",
                        hjust = 0.5, color = "lightgrey") +
  scale_fill_manual(values = custom_colors, name = "Sample origin", na.translate = FALSE,
                    limits = legend_order, labels = sample_origin_labels) +
  legend_box_theme +
  guides(fill = guide_legend(label.position = "right",
                             label.hjust = 0,
                             label.vjust = 0.5,
                             keyheight = unit(1.75, "lines"))) +
  new_scale_fill()
tt3_leyenda

# Combine the legends with the figure
legend_countries <- get_legend(p_plot)
legend_bootstrap <- get_legend(tt1_leyenda)
legend_sample <- get_legend(tt3_leyenda)
legend_countries_plot <- legend_as_plot(legend_countries)
legend_bootstrap_plot <- legend_as_plot(legend_bootstrap)
legend_sample_plot <- legend_as_plot(legend_sample)

# x/y/width/height are fractions of the canvas (hand-tuned positions).
combined_plot <- ggdraw() +
  draw_plot(tt5) +
  draw_plot(legend_countries_plot, x = 0, y = 0.7, width = 0.22, height = 0.28) +
  draw_plot(legend_bootstrap_plot, x = -0.002, y = 0.515, width = 0.1525, height = 0.3) +
  draw_plot(legend_sample_plot, x = 0.172, y = 0.7, width = 0.22, height = 0.2755)
print(combined_plot)
combined_plot
######## RICKETTSIALES PHYLOGENY ######
library(ggimage)
library(tidyverse)
library(ggrepel)
library(ggtree)
# Read the 23S tree and the per-sequence annotation table (ID, species,
# taxonomy string, host), then tag every tip with a lineage group so that
# branches can be coloured by group later on.
ricke_tree <- read.tree("/home/brown/Documentos/Proyectos/Elisa/ricke/23S_Ricke_1boot.mafft.treefile")
aso_file <- read.table("/home/brown/Documentos/Proyectos/Elisa/ricke/aso_sp_23S_def_host2.tsv", sep = "\t", col.names = c("ID", "Sp", "tax", "host"))
# NOTE(review): this lookup is not used before `d` is reassigned further down
# in the script -- confirm whether it is still needed.
d <- ggimage::phylopic_uid(ricke_tree$tip.label)

# `Genre` is not a column of the file read above, so it has to be derived
# before it can be filtered: take the 6th ';'-separated field of `tax`
# (presumably the genus rank -- confirm) and override it with "Candidatus"
# whenever the species name contains that word.
tax_fields <- strsplit(as.character(aso_file$tax), ";")
aso_file$Genre <- vapply(tax_fields, function(fields) fields[6], character(1))
aso_file$Genre <- ifelse(grepl("Candidatus", aso_file$Sp), "Candidatus", aso_file$Genre)

# Keep only the two groups that get their own colour; everything else is NA.
genre_in <- c("Candidatus", "Incertae Sedis")
aso_file$Genre <- ifelse(aso_file$Genre %in% genre_in, aso_file$Genre, NA)
aso_file <- aso_file %>% add_row(ID = "unk_Rick", Sp = "unk_Rick", tax = "unk_Rick", host = "Oppiella nova.jpg", Genre = "unknown Rick")

# split() drops the NA group, so only the labelled IDs are passed to groupOTU().
ricke_genre <- split(aso_file$ID, aso_file$Genre)
ricke_tree <- groupOTU(ricke_tree, ricke_genre)
# Node labels are converted to integers; they are compared numerically
# (>= 90) when the bootstrap points are drawn.
ricke_tree$node.label <- as.integer(ricke_tree$node.label)
aso_images <- aso_file %>% select(c("ID", "host"))
# Host names for which a tip label should be considered.
labels_host <- c(
  "Acanthamoneba", "Amblyomma_variegatum", "Bemisia_tabaci", "Brugia_malayi",
  "cattle", "Cimex_lectularius", "Armadillidium_vulgare",
  "Culex_quinquefasciatus", "Dermacentor_variabilis", "Drosophila",
  "Drosophila_melanogaster", "Drosophila_simulans", "Dysmicoccus_sylvarum",
  "Folsomia_candida", "Homo_sapiens", "Ixodes_ricinus",
  "Laodelphax_striatella", "Mus_musculus", "Mycopsylla_fici", "Oppiella_nova",
  "Onchocerca_ochengi", "Onchocerca_volvulus", "Trichogramma_pretiosum",
  "Amblyomma_neumanni"
)
#ricke_tree$display_label <- ifelse(ricke_tree$tip.label %in% labels_host, ricke_tree$tip.label, NA)
# The tree object itself exposes `tip.label` / `node.label` (both used in this
# script) but no `label` or `isTip`; those columns only exist in the tabular
# form of the tree, so the filter is applied to the fortified table.
# NOTE(review): relies on ggtree's fortify() method for the tree -- confirm
# that `filtered_data` is expected to be a table of tip rows.
filtered_data <- subset(fortify(ricke_tree), isTip & label %in% labels_host)
# Clade root node ids and the genus each highlighted clade represents
# (node ids are hard-coded for this particular tree).
nodes_highlight <- data.frame(
  node_id = c(181, 273, 300, 288, 317, 336),
  Genre = c("Rickettsia", "Orientia", "Ehrlichia", "Anaplasma", "Wolbachia", "Neorickettsia")
)
library(glue)
# Host name per tip, wrapped in plotmath italic() so the labels can be parsed.
d <- read.table("/home/brown/Documentos/Proyectos/Elisa/ricke/aso_id_host.tsv", sep = "\t", col.names = c("tip_labels", "host"))
d2 <- dplyr::mutate(d, lab = glue("italic({host})"))

# Base tree: branches coloured by the group assigned with groupOTU();
# ungrouped branches fall back to black via `na.value`.
phylo_tmp <- ggtree(
  ricke_tree,
  aes(color = factor(group, levels = c("Candidatus", "Incertae Sedis", "unknown Rick"))),
  layout = "rectangular", branch.length = "bootstrap", size = 1.3
) +
  scale_color_manual(values = c("#FFA319", "#155F83", "#EE2C2C"), na.value = "black") +
  theme_tree2(legend.position = "none") #+ geom_text(aes(label= node), hjust =-0.5 )

# SPECIES LABELS
phylo_labels <- phylo_tmp %<+% d2 +
  geom_label_repel(aes(label = lab), parse = TRUE, size = 3, max.overlaps = 20,
                   label.r = 0.2, xlim = c(0.5, 1), ylim = c(10, 140))

# COLLAPSE
phylo_labels <- phylo_labels %>% collapse(node = 190)
phylo_labels2 <- phylo_labels + new_scale_color()

# BOOTSTRAP
# Green points are drawn only on the nodes listed here, and only when their
# label, read as a number, is at least 90.
nodes_parent <- c(172, 173, 14, 285, 286, 287, 317, 336, 300, 175, 176, 177, 277, 283, 281, 178, 179, 180, 181, 272, 182)
phylo_boot <- phylo_labels2 +
  geom_hilight(data = nodes_highlight, mapping = aes(node = node_id, fill = Genre), alpha = .4) +
  scale_fill_manual(values = c("#B2DFEE", "#EE9A49", "#CAFF70", "gold", "coral1", "#8968CD")) +
  geom_point2(aes(x = x, y = y, subset = as.numeric(label) >= 90),
              data = phylo_tmp$data[phylo_tmp$data$node %in% nodes_parent, ],
              color = "#66CD00", size = 3) +
  theme(legend.position = "none") #+ geom_nodepoint(aes(subset= label >= 90),color='#436EEE', size = 2.5)

# Host silhouettes: PhyloPic image ids anchored at the x/y position of the
# given nodes (coordinates are looked up in the base tree data).
phylopic_df <- data.frame(
  nodes = c(181, 183, 288, 317),
  images = c("eb778df9-a98e-4b14-9aad-06ad8f9b2223", "036b96de-4bca-408e-adb6-2154fcd724ef",
             "415714b4-859c-4d1c-9ce0-9e1081613df7", "0cd6cc9f-683c-470e-a4a6-3b68beb826fa")
)
ricke_data <- phylo_tmp$data
phylopic_df <- left_join(phylopic_df, ricke_data[, c("node", "x", "y")], by = c("nodes" = "node"))
#aso_file %<+% phylopic_df
# geom_phylopic() is exported by more than one attached package; this call
# uses the `image` aesthetic, so the ggimage version is requested explicitly
# instead of depending on library load order.
phylo_def <- phylo_boot +
  ggimage::geom_phylopic(data = phylopic_df, aes(x = x - 0.01, y = y + 3, image = images),
                         size = c(0.08, 0.03, 0.08, 0.08), inherit.aes = FALSE,
                         color = c(rep("coral1", 2), "#B2DFEE", "#8968CD"))
### LEGENDS
# Stand-alone legends are built from dummy plots and overlaid on `phylo_def`.

# Branch-colour legend. Values, breaks and labels are all given in the same
# explicit order so a label can never end up next to another group's colour.
lineage_colors <- c("Candidatus" = "#FFA319", "Incertae Sedis" = "#155F83", "unknown Rick" = "#EE2C2C")
data_1 <- data.frame(x = c("Candidatus", "Incertae Sedis", "unknown Rick"), y = c(1, 2, 3))
g1 <- ggplot(data_1, aes(x = x, y = 1, color = x)) +
  geom_line(linewidth = 1) +
  scale_color_manual(values = lineage_colors, breaks = names(lineage_colors),
                     labels = expression(italic("Candidatus"), italic("Incertae Sedis"), "unknown Rick")) +
  theme_minimal() +
  theme(legend.box.background = element_rect(color = "black", linewidth = 0.5),
        legend.title = element_blank())

# Clade-highlight legend. The previous version listed the italic labels in
# data order while the unnamed colours were assigned to the alphabetically
# sorted genera, so each name was shown beside another genus' colour. The
# colours below keep that alphabetical assignment and the labels are generated
# from the same names, which keeps the pairs in sync.
genus_colors <- c(Anaplasma = "#B2DFEE", Ehrlichia = "#EE9A49", Neorickettsia = "#CAFF70",
                  Orientia = "gold", Rickettsia = "coral1", Wolbachia = "#8968CD")
genus_labels <- parse(text = sprintf('italic("%s")', names(genus_colors)))
data_2 <- data.frame(Genus = c("Rickettsia", "Orientia", "Ehrlichia", "Anaplasma", "Wolbachia", "Neorickettsia"), y = c(1, 2, 3, 4, 5, 6))
g2 <- ggplot(data_2, aes(x = Genus, y = y, fill = Genus)) +
  geom_col(alpha = .5, color = NA) +
  scale_fill_manual(values = genus_colors, breaks = names(genus_colors), labels = genus_labels) +
  theme(legend.box.background = element_rect(color = "black", linewidth = 0.5))

# Wrap each extracted legend in an empty plot so draw_plot() can place it.
legend_g1 <- get_legend(g1)
ggplot_legend_g1 <- ggplot() + theme_void() +
  annotation_custom(legend_g1, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf)
legend_g2 <- get_legend(g2)
ggplot_legend_g2 <- ggplot() + theme_void() +
  annotation_custom(legend_g2, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf)
combined_plot <- ggdraw() + draw_plot(phylo_def) +
  draw_plot(ggplot_legend_g1, x = 0.03, y = 0.8, width = 0.22, height = 0.28) +
  draw_plot(ggplot_legend_g2, x = 0.06, y = 0.615, width = 0.1525, height = 0.3)
# Host silhouettes: PhyloPic image ids anchored at the x/y position of the
# given nodes (coordinates are looked up in the base tree data).
phylopic_df <- data.frame(
  nodes = c(181, 183, 288, 317),
  images = c("eb778df9-a98e-4b14-9aad-06ad8f9b2223", "036b96de-4bca-408e-adb6-2154fcd724ef",
             "415714b4-859c-4d1c-9ce0-9e1081613df7", "0cd6cc9f-683c-470e-a4a6-3b68beb826fa")
)
ricke_data <- phylo_tmp$data
phylopic_df <- left_join(phylopic_df, ricke_data[, c("node", "x", "y")], by = c("nodes" = "node"))
#aso_file %<+% phylopic_df
# geom_phylopic() is exported by more than one attached package; this call
# uses the `image` aesthetic, so the ggimage version is requested explicitly
# instead of depending on library load order.
phylo_def <- phylo_boot +
  ggimage::geom_phylopic(data = phylopic_df, aes(x = x - 0.01, y = y + 3, image = images),
                         size = c(0.08, 0.03, 0.08, 0.08), inherit.aes = FALSE,
                         color = c(rep("coral1", 2), "#B2DFEE", "#8968CD"))

### LEGENDS
# Branch-colour legend. Values, breaks and labels are all given in the same
# explicit order so a label can never end up next to another group's colour.
lineage_colors <- c("Candidatus" = "#FFA319", "Incertae Sedis" = "#155F83", "unknown Rick" = "#EE2C2C")
data_1 <- data.frame(x = c("Candidatus", "Incertae Sedis", "unknown Rick"), y = c(1, 2, 3))
g1 <- ggplot(data_1, aes(x = x, y = 1, color = x)) +
  geom_line(linewidth = 1) +
  scale_color_manual(values = lineage_colors, breaks = names(lineage_colors),
                     labels = expression(italic("Candidatus"), italic("Incertae Sedis"), "unknown Rick")) +
  theme_minimal() +
  theme(legend.box.background = element_rect(color = "black", linewidth = 0.5),
        legend.title = element_blank())

# Clade-highlight legend. The previous version listed the italic labels in
# data order while the unnamed colours were assigned to the alphabetically
# sorted genera, so each name was shown beside another genus' colour. The
# colours below keep that alphabetical assignment and the labels are generated
# from the same names, which keeps the pairs in sync.
genus_colors <- c(Anaplasma = "#B2DFEE", Ehrlichia = "#EE9A49", Neorickettsia = "#CAFF70",
                  Orientia = "gold", Rickettsia = "coral1", Wolbachia = "#8968CD")
genus_labels <- parse(text = sprintf('italic("%s")', names(genus_colors)))
data_2 <- data.frame(Genus = c("Rickettsia", "Orientia", "Ehrlichia", "Anaplasma", "Wolbachia", "Neorickettsia"), y = c(1, 2, 3, 4, 5, 6))
g2 <- ggplot(data_2, aes(x = Genus, y = y, fill = Genus)) +
  geom_col(alpha = .5, color = NA) +
  scale_fill_manual(values = genus_colors, breaks = names(genus_colors), labels = genus_labels) +
  theme(legend.box.background = element_rect(color = "black", linewidth = 0.5))

# Wrap each extracted legend in an empty plot so draw_plot() can place it.
legend_g1 <- get_legend(g1)
ggplot_legend_g1 <- ggplot() + theme_void() +
  annotation_custom(legend_g1, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf)
legend_g2 <- get_legend(g2)
ggplot_legend_g2 <- ggplot() + theme_void() +
  annotation_custom(legend_g2, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf)
combined_plot <- ggdraw() + draw_plot(phylo_def) +
  draw_plot(ggplot_legend_g1, x = 0.03, y = 0.8, width = 0.22, height = 0.28) +
  draw_plot(ggplot_legend_g2, x = 0.06, y = 0.615, width = 0.1525, height = 0.3)
# Host silhouettes: PhyloPic image ids anchored at the x/y position of the
# given nodes (coordinates are looked up in the base tree data).
phylopic_df <- data.frame(
  nodes = c(181, 183, 288, 317),
  images = c("eb778df9-a98e-4b14-9aad-06ad8f9b2223", "036b96de-4bca-408e-adb6-2154fcd724ef",
             "415714b4-859c-4d1c-9ce0-9e1081613df7", "0cd6cc9f-683c-470e-a4a6-3b68beb826fa")
)
ricke_data <- phylo_tmp$data
phylopic_df <- left_join(phylopic_df, ricke_data[, c("node", "x", "y")], by = c("nodes" = "node"))
#aso_file %<+% phylopic_df
# `phylo_def` used to be reassigned twice more with the aesthetic spelled
# `images` and `imageg`; neither is a recognised aesthetic, so only the
# definition that maps `image` is kept, together with an explicit ggimage::
# prefix (geom_phylopic() is exported by more than one attached package).
# The interactive `?geom_phylopic` help lookup was dropped as well.
phylo_def <- phylo_boot +
  ggimage::geom_phylopic(data = phylopic_df, aes(x = x - 0.01, y = y + 3, image = images),
                         size = c(0.08, 0.03, 0.08, 0.08), inherit.aes = FALSE,
                         color = c(rep("coral1", 2), "#B2DFEE", "#8968CD"))
# Host name per tip, wrapped in plotmath italic() so the labels can be parsed.
d <- read.table("/home/brown/Documentos/Proyectos/Elisa/ricke/aso_id_host.tsv", sep = "\t", col.names = c("tip_labels", "host"))
d2 <- dplyr::mutate(d, lab = glue("italic({host})"))

# Base tree: branches coloured by the group assigned with groupOTU();
# ungrouped branches fall back to black via `na.value`.
phylo_tmp <- ggtree(
  ricke_tree,
  aes(color = factor(group, levels = c("Candidatus", "Incertae Sedis", "unknown Rick"))),
  layout = "rectangular", branch.length = "bootstrap", size = 1.3
) +
  scale_color_manual(values = c("#FFA319", "#155F83", "#EE2C2C"), na.value = "black") +
  theme_tree2(legend.position = "none") #+ geom_text(aes(label= node), hjust =-0.5 )

# SPECIES LABELS
phylo_labels <- phylo_tmp %<+% d2 +
  geom_label_repel(aes(label = lab), parse = TRUE, size = 3, max.overlaps = 20,
                   label.r = 0.2, xlim = c(0.5, 1), ylim = c(10, 140))

# COLLAPSE
phylo_labels <- phylo_labels %>% collapse(node = 190)
phylo_labels2 <- phylo_labels + new_scale_color()

# BOOTSTRAP
# Green points are drawn only on the nodes listed here, and only when their
# label, read as a number, is at least 90.
nodes_parent <- c(172, 173, 14, 285, 286, 287, 317, 336, 300, 175, 176, 177, 277, 283, 281, 178, 179, 180, 181, 272, 182)
phylo_boot <- phylo_labels2 +
  geom_hilight(data = nodes_highlight, mapping = aes(node = node_id, fill = Genre), alpha = .4) +
  scale_fill_manual(values = c("#B2DFEE", "#EE9A49", "#CAFF70", "gold", "coral1", "#8968CD")) +
  geom_point2(aes(x = x, y = y, subset = as.numeric(label) >= 90),
              data = phylo_tmp$data[phylo_tmp$data$node %in% nodes_parent, ],
              color = "#66CD00", size = 3) +
  theme(legend.position = "none") #+ geom_nodepoint(aes(subset= label >= 90),color='#436EEE', size = 2.5)

# Host silhouettes: PhyloPic image ids anchored at the x/y position of the
# given nodes (coordinates are looked up in the base tree data).
phylopic_df <- data.frame(
  nodes = c(181, 183, 288, 317),
  images = c("eb778df9-a98e-4b14-9aad-06ad8f9b2223", "036b96de-4bca-408e-adb6-2154fcd724ef",
             "415714b4-859c-4d1c-9ce0-9e1081613df7", "0cd6cc9f-683c-470e-a4a6-3b68beb826fa")
)
ricke_data <- phylo_tmp$data
phylopic_df <- left_join(phylopic_df, ricke_data[, c("node", "x", "y")], by = c("nodes" = "node"))
#aso_file %<+% phylopic_df
# A second assignment that mapped `img = images` used to overwrite this one;
# it was dropped because the rest of the call (`size`, `color`, image ids) is
# written for the `image` aesthetic. The ggimage:: prefix makes the intended
# implementation explicit, since geom_phylopic() is exported by more than one
# attached package.
phylo_def <- phylo_boot +
  ggimage::geom_phylopic(data = phylopic_df, aes(x = x - 0.01, y = y + 3, image = images),
                         size = c(0.08, 0.03, 0.08, 0.08), inherit.aes = FALSE,
                         color = c(rep("coral1", 2), "#B2DFEE", "#8968CD"))
### LEGENDS
# Stand-alone legends are built from dummy plots and overlaid on `phylo_def`.

# Branch-colour legend. Values, breaks and labels are all given in the same
# explicit order so a label can never end up next to another group's colour.
lineage_colors <- c("Candidatus" = "#FFA319", "Incertae Sedis" = "#155F83", "unknown Rick" = "#EE2C2C")
data_1 <- data.frame(x = c("Candidatus", "Incertae Sedis", "unknown Rick"), y = c(1, 2, 3))
g1 <- ggplot(data_1, aes(x = x, y = 1, color = x)) +
  geom_line(linewidth = 1) +
  scale_color_manual(values = lineage_colors, breaks = names(lineage_colors),
                     labels = expression(italic("Candidatus"), italic("Incertae Sedis"), "unknown Rick")) +
  theme_minimal() +
  theme(legend.box.background = element_rect(color = "black", linewidth = 0.5),
        legend.title = element_blank())

# Clade-highlight legend. The previous version listed the italic labels in
# data order while the unnamed colours were assigned to the alphabetically
# sorted genera, so each name was shown beside another genus' colour. The
# colours below keep that alphabetical assignment and the labels are generated
# from the same names, which keeps the pairs in sync.
genus_colors <- c(Anaplasma = "#B2DFEE", Ehrlichia = "#EE9A49", Neorickettsia = "#CAFF70",
                  Orientia = "gold", Rickettsia = "coral1", Wolbachia = "#8968CD")
genus_labels <- parse(text = sprintf('italic("%s")', names(genus_colors)))
data_2 <- data.frame(Genus = c("Rickettsia", "Orientia", "Ehrlichia", "Anaplasma", "Wolbachia", "Neorickettsia"), y = c(1, 2, 3, 4, 5, 6))
g2 <- ggplot(data_2, aes(x = Genus, y = y, fill = Genus)) +
  geom_col(alpha = .5, color = NA) +
  scale_fill_manual(values = genus_colors, breaks = names(genus_colors), labels = genus_labels) +
  theme(legend.box.background = element_rect(color = "black", linewidth = 0.5))

# Wrap each extracted legend in an empty plot so draw_plot() can place it.
legend_g1 <- get_legend(g1)
ggplot_legend_g1 <- ggplot() + theme_void() +
  annotation_custom(legend_g1, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf)
legend_g2 <- get_legend(g2)
ggplot_legend_g2 <- ggplot() + theme_void() +
  annotation_custom(legend_g2, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf)
combined_plot <- ggdraw() + draw_plot(phylo_def) +
  draw_plot(ggplot_legend_g1, x = 0.03, y = 0.8, width = 0.22, height = 0.28) +
  draw_plot(ggplot_legend_g2, x = 0.06, y = 0.615, width = 0.1525, height = 0.3)
#aso_file %<+% phylopic_df
# Silhouette layer, this time with the image ids mapped to `uuid`.
# NOTE(review): other definitions of `phylo_def` in this script map the same
# column to `image`; which aesthetic is accepted depends on which package's
# geom_phylopic() is in effect -- confirm the intended one.
phylo_def <- phylo_boot +
  geom_phylopic(
    data = phylopic_df,
    mapping = aes(x = x - 0.01, y = y + 3, uuid = images),
    inherit.aes = FALSE,
    size = c(0.08, 0.03, 0.08, 0.08),
    color = c("coral1", "coral1", "#B2DFEE", "#8968CD")
  )
### LEGENDS
data_1 <- data.frame(
  x = c("Candidatus", "Incertae Sedis", "unknown Rick"),
  y = c(1, 2, 3)
)
phylo_def
######## RICKETTSIALES PHYLOGENY ######
library(ggimage)
# NOTE(review): this lookup is not used before `d` is reassigned further down
# in the script -- confirm whether it is still needed.
d <- ggimage::phylopic_uid(ricke_tree$tip.label)
#aso_file$ID <- substr(aso_file$ID,0,10)
# Genre = 6th ';'-separated field of the taxonomy string (presumably the genus
# rank -- confirm). vapply() guarantees one character value per row.
tax_fields <- strsplit(as.character(aso_file$tax), ";")
aso_file$Genre <- vapply(tax_fields, function(fields) fields[6], character(1))
# Any species whose name contains "Candidatus" is grouped under that label.
aso_file$Genre <- ifelse(grepl("Candidatus", aso_file$Sp), "Candidatus", aso_file$Genre)
# Keep only the two groups that get their own colour; everything else is NA.
genre_in <- c("Candidatus", "Incertae Sedis")
aso_file$Genre <- ifelse(aso_file$Genre %in% genre_in, aso_file$Genre, NA)
aso_file <- aso_file %>% add_row(ID = "unk_Rick", Sp = "unk_Rick", tax = "unk_Rick", host = "Oppiella nova.jpg", Genre = "unknown Rick")
# split() drops the NA group, so only the labelled IDs are passed to groupOTU().
ricke_genre <- split(aso_file$ID, aso_file$Genre)
ricke_tree <- groupOTU(ricke_tree, ricke_genre)
# Node labels are converted to integers; they are compared numerically
# (>= 90) when the bootstrap points are drawn.
ricke_tree$node.label <- as.integer(ricke_tree$node.label)
aso_images <- aso_file %>% select(c("ID", "host"))
# Host names for which a tip label should be considered.
labels_host <- c(
  "Acanthamoneba", "Amblyomma_variegatum", "Bemisia_tabaci", "Brugia_malayi",
  "cattle", "Cimex_lectularius", "Armadillidium_vulgare",
  "Culex_quinquefasciatus", "Dermacentor_variabilis", "Drosophila",
  "Drosophila_melanogaster", "Drosophila_simulans", "Dysmicoccus_sylvarum",
  "Folsomia_candida", "Homo_sapiens", "Ixodes_ricinus",
  "Laodelphax_striatella", "Mus_musculus", "Mycopsylla_fici", "Oppiella_nova",
  "Onchocerca_ochengi", "Onchocerca_volvulus", "Trichogramma_pretiosum",
  "Amblyomma_neumanni"
)
#ricke_tree$display_label <- ifelse(ricke_tree$tip.label %in% labels_host, ricke_tree$tip.label, NA)
# The tree object itself exposes `tip.label` / `node.label` (both used in this
# script) but no `label` or `isTip`; those columns only exist in the tabular
# form of the tree, so the filter is applied to the fortified table.
# NOTE(review): relies on ggtree's fortify() method for the tree -- confirm
# that `filtered_data` is expected to be a table of tip rows.
filtered_data <- subset(fortify(ricke_tree), isTip & label %in% labels_host)
# Clade root node ids and the genus each highlighted clade represents
# (node ids are hard-coded for this particular tree).
nodes_highlight <- data.frame(
  node_id = c(181, 273, 300, 288, 317, 336),
  Genre = c("Rickettsia", "Orientia", "Ehrlichia", "Anaplasma", "Wolbachia", "Neorickettsia")
)
library(glue)
# Host name per tip, wrapped in plotmath italic() so the labels can be parsed.
d <- read.table("/home/brown/Documentos/Proyectos/Elisa/ricke/aso_id_host.tsv", sep = "\t", col.names = c("tip_labels", "host"))
d2 <- dplyr::mutate(d, lab = glue("italic({host})"))

# Base tree: branches coloured by the group assigned with groupOTU();
# ungrouped branches fall back to black via `na.value`.
phylo_tmp <- ggtree(
  ricke_tree,
  aes(color = factor(group, levels = c("Candidatus", "Incertae Sedis", "unknown Rick"))),
  layout = "rectangular", branch.length = "bootstrap", size = 1.3
) +
  scale_color_manual(values = c("#FFA319", "#155F83", "#EE2C2C"), na.value = "black") +
  theme_tree2(legend.position = "none") #+ geom_text(aes(label= node), hjust =-0.5 )

# SPECIES LABELS
phylo_labels <- phylo_tmp %<+% d2 +
  geom_label_repel(aes(label = lab), parse = TRUE, size = 3, max.overlaps = 20,
                   label.r = 0.2, xlim = c(0.5, 1), ylim = c(10, 140))

# COLLAPSE
phylo_labels <- phylo_labels %>% collapse(node = 190)
phylo_labels2 <- phylo_labels + new_scale_color()

# BOOTSTRAP
# Green points are drawn only on the nodes listed here, and only when their
# label, read as a number, is at least 90.
nodes_parent <- c(172, 173, 14, 285, 286, 287, 317, 336, 300, 175, 176, 177, 277, 283, 281, 178, 179, 180, 181, 272, 182)
phylo_boot <- phylo_labels2 +
  geom_hilight(data = nodes_highlight, mapping = aes(node = node_id, fill = Genre), alpha = .4) +
  scale_fill_manual(values = c("#B2DFEE", "#EE9A49", "#CAFF70", "gold", "coral1", "#8968CD")) +
  geom_point2(aes(x = x, y = y, subset = as.numeric(label) >= 90),
              data = phylo_tmp$data[phylo_tmp$data$node %in% nodes_parent, ],
              color = "#66CD00", size = 3) +
  theme(legend.position = "none") #+ geom_nodepoint(aes(subset= label >= 90),color='#436EEE', size = 2.5)

# Host silhouettes: PhyloPic image ids anchored at the x/y position of the
# given nodes (coordinates are looked up in the base tree data).
phylopic_df <- data.frame(
  nodes = c(181, 183, 288, 317),
  images = c("eb778df9-a98e-4b14-9aad-06ad8f9b2223", "036b96de-4bca-408e-adb6-2154fcd724ef",
             "415714b4-859c-4d1c-9ce0-9e1081613df7", "0cd6cc9f-683c-470e-a4a6-3b68beb826fa")
)
ricke_data <- phylo_tmp$data
phylopic_df <- left_join(phylopic_df, ricke_data[, c("node", "x", "y")], by = c("nodes" = "node"))
#aso_file %<+% phylopic_df
# geom_phylopic() is exported by more than one attached package; this call
# uses the `image` aesthetic, so the ggimage version is requested explicitly
# instead of depending on library load order.
phylo_def <- phylo_boot +
  ggimage::geom_phylopic(data = phylopic_df, aes(x = x - 0.01, y = y + 3, image = images),
                         size = c(0.08, 0.03, 0.08, 0.08), inherit.aes = FALSE,
                         color = c(rep("coral1", 2), "#B2DFEE", "#8968CD"))
### LEGENDS
# Stand-alone legends are built from dummy plots and overlaid on `phylo_def`.
# This version uses the plain group / genus names as legend labels.

# Branch-colour legend.
data_1 <- data.frame(x = c("Candidatus", "Incertae Sedis", "unknown Rick"), y = c(1, 2, 3))
g1 <- ggplot(data_1, aes(x = x, y = 1, color = x)) +
  geom_line(linewidth = 1) +
  scale_color_manual(values = c("#FFA319", "#155F83", "#EE2C2C")) +
  theme_minimal() +
  # `linewidth` replaces the deprecated `size` argument for the box outline.
  theme(legend.box.background = element_rect(color = "black", linewidth = 0.5),
        legend.title = element_blank())

# Clade-highlight legend.
data_2 <- data.frame(Genus = c("Rickettsia", "Orientia", "Ehrlichia", "Anaplasma", "Wolbachia", "Neorickettsia"), y = c(1, 2, 3, 4, 5, 6))
g2 <- ggplot(data_2, aes(x = Genus, y = y, fill = Genus)) +
  geom_col(alpha = .5, color = NA) +
  scale_fill_manual(values = c("#B2DFEE", "#EE9A49", "#CAFF70", "gold", "coral1", "#8968CD")) +
  theme(legend.box.background = element_rect(color = "black", linewidth = 0.5))

# Wrap each extracted legend in an empty plot so draw_plot() can place it.
legend_g1 <- get_legend(g1)
ggplot_legend_g1 <- ggplot() + theme_void() +
  annotation_custom(legend_g1, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf)
legend_g2 <- get_legend(g2)
ggplot_legend_g2 <- ggplot() + theme_void() +
  annotation_custom(legend_g2, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf)
combined_plot <- ggdraw() + draw_plot(phylo_def) +
  draw_plot(ggplot_legend_g1, x = 0.03, y = 0.8, width = 0.22, height = 0.28) +
  draw_plot(ggplot_legend_g2, x = 0.06, y = 0.615, width = 0.1525, height = 0.3)
# Display the assembled figure (the name was previously misspelled, which
# referenced an object that does not exist).
combined_plot
setwd("~/Documentos/scripts/aba_defensome/data paper/grids ML")
# XGBoost classification of defense-system types from prophage features
library(xgboost)
library(pROC)
library(dplyr)
library(irr)          # to obtain kappa
library(caret)
rm(list = ls(all.names = TRUE)) # empty the workspace, hidden objects included
set.seed(3)
# Input matrix: the second column of the file becomes the row names and the
# first remaining column is dropped.
df <- read.table(
  file = "matrix_90_ab_ml_freqmlst8_wored100_onlyph.tsv",
  header = TRUE,
  sep = "\t",
  row.names = 2
)[, -1]
# Defense-system types; one binary classifier is trained per entry.
type_list <- c("Cas", "CBASS", "Gabija", "RosmerTA", "R-M", "Gao_Qat",
               "PD-T4-5", "PD-T7-5", "Ssp")
# Train, evaluate and save one binary XGBoost model per defense-system type.
# For every type `t` the loop:
#   1. labels rows 1/0 depending on whether `df$types` mentions `t`,
#   2. makes a random 70/30 train/test split,
#   3. loads the pre-computed hyperparameter grid gridxgb_<t>.RData and takes
#      the combination with the lowest cross-validated error,
#   4. fits the model, scores the test set and collects the metrics,
#   5. stores the ROC plot (in `roc_list` and as a variable named after the
#      type, which the plotting code after the loop refers to) and saves the
#      model together with the positive rows as an .rds file.
roc_list <- list()
perf_rows <- list() # one single-row data frame per type, bound after the loop
for (t in type_list) {
  print(t)
  # Literal (non-regex) match of the type name inside the `types` annotation.
  has_type <- grepl(t, df$types, fixed = TRUE)
  Pres <- df[has_type, ]
  Pres$types <- 1L
  Aus <- df[!has_type, ]
  Aus$types <- 0L
  data <- rbind(Pres, Aus)
  # Split data in training and testing data
  m <- sample.int(n = nrow(data), size = floor(.7 * nrow(data)), replace = FALSE)
  traindata <- data[m, ]
  testdata <- data[-m, ]
  # Only predictors (features) as matrix
  traindatax <- as.matrix(subset(traindata, select = -types))
  # Objective variable (labels)
  traindatay <- c(traindata$types)
  # Same in testing data
  testdatax <- as.matrix(subset(testdata, select = -types))
  testdatay <- c(testdata$types)

  # Archived random grid search that wrote the gridxgb_<type>.RData files
  # loaded below; kept commented out for reference.
  # NOTE(review): it also tests `t == "CAS"`, which never matches the "Cas"
  # entry of `type_list`, and `c(.5, 1, by=0.15)` builds a three-value vector
  # rather than a sequence -- keep in mind if the grids are ever regenerated.
  # # Grid search for hyperparameters optimization
  #   gridxgb <- expand.grid(eta = c(1e-3, 1e-2, 1e-1), # Learning rate
  #                          max_depth = c(1, 5, 10), # Max depth of the tree
  #                          min_child_weight = c(1, 5), # Minimal number of requiered samples in each terminal node
  #                          subsample = c(.5, 1, by=0.15), # Training subset percentage for each tree
  #                          colsample_bytree= c(.5, 1, by=0.15), # Percentage of features for each tree
  #                          gamma = c(1e-3, 1e-2, 1e-1,1, 5), # Regularization (loss reduction)
  #                          lambda = c(0, 1e-3, 1e-2, 1e-1, 1, 10), # Regularization L2
  #                          alpha = c( 0, 1e-3, 1e-2, 1e-1, 1, 10), # Regularization L1
  # #
  #                          CC = 0,
  #                          optntrees = 0, # Save results
  #                          minerror = 0) # Save results
  #   ncombs <- 10000 # No. random combinations to try
  #   nc <- sample(nrow(gridxgb), ncombs) # Random index
  #   gridxgb <- gridxgb[nc, ] # Random hyperparameters
  # #
  #   if (t == "CAS" || t == 'Gabija'){ # Unbalanced data
  #     ratio<- nrow(Aus)/nrow(Pres)
  #     for(i in 1:ncombs){
  #       fitxgb <- xgb.cv(data=traindatax, label=traindatay, nrounds = 2000, nthread = 10,
  #                        metrics = "error", early_stopping_rounds = 10, nfold = 5,
  #                        scale_pos_weight = ratio,
  #                        verbose = F,
  #                        objective = "binary:logistic",
  #                        prediction = T,
  #                        eta = gridxgb$eta[i],
  #                        max_depth = gridxgb$max_depth[i],
  #                        min_child_weight = gridxgb$min_child_weight[i],
  #                        subsample = gridxgb$subsample[i],
  #                        colsample_bytree = gridxgb$colsample_bytree[i],
  #                        gamma = gridxgb$gamma[i],
  #                        lambda = gridxgb$lambda[i],
  #                        alpha = gridxgb$alpha[i])
  # #
  #       # Saving results
  #       gridxgb$optntrees[i] <- fitxgb$best_iteration
  #       gridxgb$minerror[i] <- min(fitxgb$evaluation_log$test_error_mean)
  #       gridxgb$CC[i] <- 100 - round(min(fitxgb$evaluation_log$test_error_mean * 100))
  # #
  #       if (i %% 100 == 0){
  #         print(i) # Print iteration number for monitoring the process
  #       }
  #     }
  #    }else{
  #     for(i in 1:ncombs){
  #       fitxgb <- xgb.cv(data=traindatax, label=traindatay, nrounds = 2000, nthread = 10,
  #                        metrics = "error", early_stopping_rounds = 10, nfold = 5,
  #                        verbose = F,
  #                        objective = "binary:logistic",
  #                        prediction = T,
  #                        eta = gridxgb$eta[i],
  #                        max_depth = gridxgb$max_depth[i],
  #                        min_child_weight = gridxgb$min_child_weight[i],
  #                        subsample = gridxgb$subsample[i],
  #                        colsample_bytree = gridxgb$colsample_bytree[i],
  #                        gamma = gridxgb$gamma[i],
  #                        lambda = gridxgb$lambda[i],
  #                        alpha = gridxgb$alpha[i])
  #
  #       # Saving results
  #       gridxgb$optntrees[i] <- fitxgb$best_iteration
  #       gridxgb$minerror[i] <- min(fitxgb$evaluation_log$test_error_mean)
  #       gridxgb$CC[i] <- 100 - round(min(fitxgb$evaluation_log$test_error_mean * 100))
  #
  #       if (i %% 100 == 0){
  #         print(i) # Print iteration number for monitoring the process
  #       }
  #
  #     }
  #   }
  # save(gridxgb, file = paste0("gridxgb_", t,".RData"))

  load(file = paste0("gridxgb_", t, ".RData")) # Loading hyperparameter grids
  # Sorting the hyperparameter grids; rows with minerror == 0 are discarded
  # (0 is the placeholder value the grid was initialised with).
  gridxgb <- gridxgb[order(gridxgb$minerror, decreasing = FALSE), ]
  gridxgb <- subset(gridxgb, minerror != 0)
  if (nrow(gridxgb) == 0) {
    stop("No usable hyperparameter combination in gridxgb_", t, ".RData", call. = FALSE)
  }
  besthppsxgb <- gridxgb[1, ]

  # Class weighting for the types treated as unbalanced. The comparison used
  # to be against "CAS", which never matched the "Cas" entry of `type_list`,
  # so the weighted fit was silently skipped for that type.
  # A weight of 1 is XGBoost's default, i.e. equivalent to not passing it.
  if (t %in% c("Cas", "Gabija")) {
    ratio <- nrow(Aus) / nrow(Pres)
    pos_weight <- ratio
  } else {
    pos_weight <- 1
  }
  bestmodelxgb <- xgboost(
    data = traindatax, label = traindatay,
    nrounds = besthppsxgb$optntrees,
    objective = "binary:logistic",
    scale_pos_weight = pos_weight,
    eta = besthppsxgb$eta,
    max_depth = besthppsxgb$max_depth,
    min_child_weight = besthppsxgb$min_child_weight,
    subsample = besthppsxgb$subsample,
    colsample_bytree = besthppsxgb$colsample_bytree,
    gamma = besthppsxgb$gamma,
    lambda = besthppsxgb$lambda,
    alpha = besthppsxgb$alpha,
    verbose = 1
  )
  prob <- predict(bestmodelxgb, testdatax)
  pred <- ifelse(prob > 0.5, 1, 0)

  # Target variable
  defsys_type <- testdatay
  # Convert to factor with consistent levels
  pred_factor <- factor(pred, levels = c(0, 1))
  obs_factor <- factor(defsys_type, levels = c(0, 1))
  # Generate confusion matrix using caret
  cm <- confusionMatrix(pred_factor, obs_factor, positive = "1")
  # Extract metrics ([[ ]] drops the element names so they do not leak into
  # the results table)
  acc <- round(cm$overall[["Accuracy"]], 2)
  kappa <- round(cm$overall[["Kappa"]], 2)
  precision <- round(cm$byClass[["Precision"]], 2)
  recall <- round(cm$byClass[["Recall"]], 2)
  f1_score <- round(cm$byClass[["F1"]], 2)
  sensitivity <- round(cm$byClass[["Sensitivity"]], 2)
  specificity <- round(cm$byClass[["Specificity"]], 2)
  # Calculate ROC and AUC
  roc_obj <- roc(response = defsys_type, predictor = prob, levels = c("0", "1"), quiet = TRUE)
  AUC <- round(as.numeric(auc(roc_obj)), 2)
  # ggplot version of the ROC curve
  roc_plot <- ggroc(roc_obj, color = "black", size = 1.2) +
    ggtitle(paste("ROC Curve ", t, "(AUC = ", AUC, ")")) +
    theme_minimal()
  # Store the plot as ONE list element (append() would splice the plot's
  # internal components into the list) and also under a variable named after
  # the type, which the plot_grid() calls after the loop rely on.
  roc_list[[t]] <- roc_plot
  assign(t, roc_plot)

  # One typed row per model; every reported value gets its own named column.
  perf_rows[[t]] <- data.frame(
    Types = t, Kappa = kappa, Accuracy = acc, AUC = AUC,
    Precision = precision, Recall = recall, F1 = f1_score,
    Sensitivity = sensitivity, Specificity = specificity,
    stringsAsFactors = FALSE
  )

  model_data <- list(
    model = bestmodelxgb,
    pres = Pres
  )
  model_name <- paste0("xgb_model_defensome_", t, ".rds") # RDS is a R-specific format that preserves all components of the model
  saveRDS(model_data, model_name)
}
# Performance table: one row per defense-system type.
df_performance <- do.call(rbind, perf_rows)
rownames(df_performance) <- NULL
# Arrange the nine per-system ROC curves in a 3 x 3 grid and write them to
# ROC_curves.pdf (12 x 18 in), which is the layout of the last export that was
# run. The earlier attempts opened several pdf() devices on the same file
# without closing them; the device is now opened once and always closed.
all_roc_plot <- plot_grid(Cas, CBASS, Gao_Qat, `R-M`, `PD-T4-5`, `PD-T7-5`,
                          Gabija, RosmerTA, Ssp, ncol = 3, nrow = 3)
pdf("ROC_curves.pdf", height = 12, width = 18)
tryCatch(
  print(all_roc_plot),
  finally = invisible(dev.off()) # close the device even if drawing fails
)
library(ggplot2)
library(dplyr)
library(tidyverse)
# Loading data
# Two-column table without header: sequence ID and its MLST value.
mlst_file <- read.table(file = "mlst_ab_freq_wored100.tsv", header = FALSE, sep = "\t", col.names = c("ID", "mlst"))
# Metadata file extracted from Suppl. Table S1.
# Quoting is disabled with quote = "". The previous quote = "," declared the
# comma as a quoting character, so any field containing a comma would have
# been merged with the following text up to the next comma.
subregion_file <- read.table(file = "metadata_ab_subregions.tsv", header = TRUE, sep = "\t", quote = "")
